In [1]:
from IPython.display import FileLink
FileLink('Titanic baby step for pandas Part 1.ipynb')





In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib as plt
from IPython.display import display_html

In [3]:
df = pd.read_csv('data/train.csv')
df.head()


Out[3]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35 0 0 373450 8.0500 NaN S

In [4]:
df.describe()


Out[4]:
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200

fill the empty age with mean value


In [5]:
df['AgeFixed'] = df['Age'].fillna(df['Age'].mean())

Take the salutation part


In [6]:
df['Salutation'] = df['Name'].apply(lambda x: x.split(', ')[1].split('.')[0])
df.head()


Out[6]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked AgeFixed Salutation
0 1 0 3 Braund, Mr. Owen Harris male 22 1 0 A/5 21171 7.2500 NaN S 22 Mr
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38 1 0 PC 17599 71.2833 C85 C 38 Mrs
2 3 1 3 Heikkinen, Miss. Laina female 26 0 0 STON/O2. 3101282 7.9250 NaN S 26 Miss
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0 113803 53.1000 C123 S 35 Mrs
4 5 0 3 Allen, Mr. William Henry male 35 0 0 373450 8.0500 NaN S 35 Mr

here is how salutation looks like


In [7]:
df.groupby('Salutation')['PassengerId'].count()


Out[7]:
Salutation
Capt              1
Col               2
Don               1
Dr                7
Jonkheer          1
Lady              1
Major             2
Master           40
Miss            182
Mlle              2
Mme               1
Mr              517
Mrs             125
Ms                1
Rev               6
Sir               1
the Countess      1
Name: PassengerId, dtype: int64

remove the Capt / Don part


In [8]:
s = df.groupby('Salutation')['PassengerId'].count()
s.sort(ascending=False)
valid_vals = s[:4].index
df['SalutationNew'] = df['Salutation'].apply(lambda x: x in valid_vals and x or 'Others')
df.boxplot('Age', by='SalutationNew', figsize=(20, 4))


Out[8]:
<matplotlib.axes.AxesSubplot at 0x7f07e481db10>

SalutationNew 和 Age 关系比较显著


In [9]:
df.boxplot('Age', by=['Sex', 'Pclass'], figsize=(20, 4))


Out[9]:
<matplotlib.axes.AxesSubplot at 0x7f07e47c99d0>

Sex/Pclass 和 Age 的关系也比较显著 所以可以用 Sex/Pclass/SalutationNew 这三个变量矫正缺失值


In [10]:
# 翻转矩阵 算出 Sex/Pclass 下每个 SalutationNew 的 Age 中位数值 用来填充缺失 Age
t = df.pivot_table(values='Age', index='SalutationNew', columns=['Sex', 'Pclass'], aggfunc=np.median)
display_html(t)
df['AgeFixed'] = df['Age']
df['AgeFixed'].fillna(df[df['Age'].isnull()].apply(lambda x: t[x['Sex']][x['Pclass']][x['SalutationNew']], axis=1), inplace=True)
df.describe()


Sex female male
Pclass 1 2 3 1 2 3
SalutationNew
Master NaN NaN NaN 4 1.0 4
Miss 30.0 24 18 NaN NaN NaN
Mr NaN NaN NaN 40 31.0 26
Mrs 41.5 32 31 NaN NaN NaN
Others 28.5 28 NaN 49 46.5 NaN
Out[10]:
PassengerId Survived Pclass Age SibSp Parch Fare AgeFixed
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208 29.149461
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429 13.503296
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000 0.420000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400 21.000000
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200 26.000000
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000 36.750000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200 80.000000

找出异常的 Fare 数据并用正常临界值代替掉


In [11]:
fig = plt.pyplot.figure(figsize=(30, 4))
ax = fig.add_subplot(121)
ax.set_title('before fix')

s = df['Fare'].copy()
s.hist()
n = 4
outhigh_fare = s[s - s.mean() >= n * s.std()]
outlow_fare = s[s.mean() - s >= n * s.std()]
if outhigh_fare.count():
    s[s >= outhigh_fare.max()] = s[s < outhigh_fare.min()].max()
if outlow_fare.count():
    s[s <= outhigh_fare.min()] = s[s > outhigh_fare.max()].min()

ax = fig.add_subplot(122)
ax.set_title('after fix')
s.hist()

# ax = fig.add_subplot(133)
# ax.set_title('after fix')
df['FareFixed'] = s
df.boxplot(['Fare', 'FareFixed'], by='Pclass', figsize=(30, 4))


Out[11]:
array([<matplotlib.axes.AxesSubplot object at 0x7f07e4341c90>,
       <matplotlib.axes.AxesSubplot object at 0x7f07e430de90>], dtype=object)

In [11]: